
import logging
from init import PATHS
LOGGER = logging.getLogger(__name__)
import pandas as pd


def import_concordance():
    """Load the CPC-IPC concordance file and derive coarser code columns.

    The tab-separated file is read without a header row. Columns 0, 2 and 4
    are renamed to ``CPCfull``, ``IPCfull`` and ``level``; columns 1 and 3
    are dropped. For each scheme, three columns are derived from the full
    code: the part before the ``/`` (``<scheme>group``), its first four
    characters (``<scheme>subclass``) and its first three characters
    (``<scheme>class``).

    Returns:
        pandas.DataFrame: the concordance table with the derived columns.
    """
    table = pd.read_csv(PATHS.rawdata / 'GreenCodes/cpc-ipc-concordance.txt', sep='\t', header=None)
    table = table.rename(columns={0: 'CPCfull', 2: 'IPCfull', 4: 'level'})
    # Same derivation for both schemes; CPC first so the column order is
    # CPCgroup, CPCsubclass, CPCclass, IPCgroup, IPCsubclass, IPCclass.
    for scheme in ('CPC', 'IPC'):
        group = table[scheme + 'full'].apply(lambda full_code: full_code.split('/')[0])
        table[scheme + 'group'] = group
        table[scheme + 'subclass'] = group.str[:4]
        table[scheme + 'class'] = group.str[:3]
    return table.drop(columns=[1, 3])


def GetAllSubGroupCodes(codes_lit, concordance):
    """Collect every concordance row embedded under the codes of interest.

    The codes in ``codes_lit['Code']`` are listed at various levels
    (subgroup, group, subclass or class). For each code:

    * rows whose group, subclass or class equals the code are gathered by
      ``GetCodes_in_ClassGroup``;
    * if the code itself is a full CPC and/or IPC code, the rows embedded
      under it are gathered via the ``level`` column by
      ``GetAllSubgroups_basedonlevel``.

    Args:
        codes_lit: DataFrame with a ``Code`` column.
        concordance: DataFrame with ``CPCfull``, ``IPCfull``, ``level`` and
            the derived group/subclass/class columns.

    Returns:
        pandas.DataFrame: de-duplicated concordance rows with an extra
        ``Code`` column holding the originating code, on a fresh 0..n-1
        index. When ``codes_lit`` has no codes, an empty frame with the
        concordance columns plus ``Code`` is returned.
    """
    # subgroupcode = 'B60K1/04'
    # code = 'B60K1'
    # code = 'B60'
    # Build the membership sets once instead of rebuilding and linearly
    # scanning a list for every code.
    full_codes = {
        'CPC': set(concordance['CPCfull'].tolist()),
        'IPC': set(concordance['IPCfull'].tolist()),
    }
    frames = []
    for code in codes_lit['Code'].tolist():
        # First, the cases where the code is at group, subclass or class level.
        frames.append(GetCodes_in_ClassGroup(code, concordance))
        # Next, the cases where the code is at subgroup level: use the
        # "level" column to get everything embedded in that subgroup.
        for scheme in ('CPC', 'IPC'):
            if code in full_codes[scheme]:
                chunk = GetAllSubgroups_basedonlevel(code, concordance, scheme=scheme)
                frames.append(chunk.assign(Code=code))
    if not frames:
        # pd.concat raises ValueError on an empty list; return an empty
        # result with the expected columns instead.
        return pd.DataFrame(columns=list(concordance.columns) + ['Code'])
    return pd.concat(frames).drop_duplicates().reset_index(drop=True)


def GetCodes_in_ClassGroup(code, concordance):
    """Return all rows embedded under a class/subclass/group level code.

    For each scheme (CPC, IPC) and each hierarchy column (group, subclass,
    class), the rows whose column value equals ``code`` are selected. Every
    full code found that way is expanded with
    ``GetAllSubgroups_basedonlevel``. When a selection is empty, the empty
    selection itself is kept so that the final concatenation always has
    something to work with.

    Args:
        code: the code to look up.
        concordance: the concordance DataFrame.

    Returns:
        pandas.DataFrame: de-duplicated rows with a ``Code`` column set to
        ``code`` (empty when nothing matches).
    """
    pieces = []
    for scheme in ('CPC', 'IPC'):
        full_column = '{}full'.format(scheme)
        for hierarc in ('group', 'subclass', 'class'):
            matched = concordance[concordance['{}{}'.format(scheme, hierarc)] == code]
            if matched.shape[0] == 0:
                # Nothing at this level: keep the empty selection.
                pieces.append(matched)
                continue
            # Expand every matching full code into its embedded subgroups.
            pieces.extend(
                GetAllSubgroups_basedonlevel(full_code, concordance, scheme=scheme)
                for full_code in matched[full_column].tolist()
            )
    combined = pd.concat(pieces).drop_duplicates()
    combined['Code'] = code
    return combined


def GetAllSubgroups_basedonlevel(subgroupcode, concordance, scheme):
    """Return the row of ``subgroupcode`` plus the rows embedded under it.

    This is needed for codes such as B60K1: e.g. B60K2001/0405 is embedded
    in B60K1/04, so the group-level prefix alone cannot identify all the
    embedded subgroups. Starting from the first row whose ``<scheme>full``
    equals ``subgroupcode``, the table is walked downwards row by row and
    every consecutive row whose ``level`` is strictly greater than the
    starting row's level is collected.

    The walk is positional and stops at the end of the table, so a code on
    the last row (or whose embedded rows run to the end of the table) is
    handled instead of raising ``KeyError``.

    Args:
        subgroupcode: full code to look up.
        concordance: DataFrame with ``<scheme>full`` and ``level`` columns.
        scheme: ``'CPC'`` or ``'IPC'``; selects the ``<scheme>full`` column.

    Returns:
        pandas.DataFrame: a copy of the matching rows, original index kept.

    Raises:
        IndexError: if ``subgroupcode`` does not occur in the column.
    """
    column = '{}full'.format(scheme)
    is_match = (concordance[column] == subgroupcode).to_numpy()
    if not is_match.any():
        raise IndexError('{!r} not found in column {!r}'.format(subgroupcode, column))
    # argmax on a boolean array gives the position of the first True.
    start = int(is_match.argmax())
    levels = concordance['level'].to_numpy()
    lev_base = levels[start]
    stop = start + 1
    # Extend the window while the following rows are nested deeper.
    while stop < len(levels) and levels[stop] > lev_base:
        stop += 1
    # Copy so callers can add columns without touching the concordance.
    return concordance.iloc[start:stop, :].copy()


def Correct_ConflictingClassification(df_allsubgroups, codes_lit):
    """Remove subgroups that a higher-level code wrongly inherited.

    E.g. F23 is dirty but F23B10 is grey. Because all the subcodes under
    F23 were collected, the F23B10 subgroups were also attached to F23.
    Here, for every code that declares a ``SupCode_DifferentType`` (its
    higher-level code of a different type), the rows attached to that
    higher-level code whose ``CPCfull`` (then ``IPCfull``) also appears
    under the lower-level code are dropped, so those subgroups stay with
    the lower-level code only.

    Args:
        df_allsubgroups: DataFrame with ``Code``, ``CPCfull`` and
            ``IPCfull`` columns.
        codes_lit: DataFrame with ``Code`` and ``SupCode_DifferentType``
            columns.

    Returns:
        pandas.DataFrame: ``df_allsubgroups`` merged with
        ``SupCode_DifferentType``, conflicting rows removed, on a fresh
        0..n-1 index. Returned unchanged (apart from the merge and the
        index reset) when no code declares a higher-level code.
    """
    sup_codes = codes_lit[['Code', 'SupCode_DifferentType']].drop_duplicates()
    df_allsubgroups = df_allsubgroups.merge(sup_codes, on='Code', how='left')
    has_sup = df_allsubgroups['SupCode_DifferentType'].notnull()
    conflicts = df_allsubgroups.loc[has_sup, ['Code', 'SupCode_DifferentType']].drop_duplicates()
    # The loop simply does not run when there are no conflicts (previously
    # an unconditional .iloc[0] raised IndexError in that case).
    for lowercode, uppercode in conflicts.itertuples(index=False, name=None):
        # CPC pass first, then IPC on the already-filtered frame.
        for full_column in ('CPCfull', 'IPCfull'):
            under_upper = df_allsubgroups['Code'] == uppercode
            lower_subgroups = df_allsubgroups.loc[df_allsubgroups['Code'] == lowercode, full_column].tolist()
            also_under_lower = df_allsubgroups[full_column].isin(lower_subgroups)
            df_allsubgroups = df_allsubgroups[~(under_upper & also_under_lower)]
    return df_allsubgroups.reset_index(drop=True)


def main():
    """Build and save the table of all CPC/IPC subgroups of interest.

    Steps: load the concordance and the manually created spreadsheet of
    codes, lower-case the sub-sector labels, keep the codes with
    ``Include > 0``, expand them into all embedded subgroups, resolve
    classification conflicts, attach the code metadata, strip whitespace
    from the sector labels and write the result to CSV.
    """
    for message in (
        'SCRIPT: a_finding_all_CPCIPC_subgroups.py Collect all CPC and IPC subgroups that we want to capture (i.e. energy)',
        'Collect all CPC and IPC subgroups that we want to capture (i.e. energy)',
        '       Goal: Collect all CPC and IPC subgroups that we labeled energy',
        '       Import IPC-CPC concordance and file that we have manually created:',
        '       GreenCPC_IPC_codes_fromliterature.xlsx',
    ):
        LOGGER.info(message)

    concordance = import_concordance()
    codes_lit = pd.read_excel(PATHS.rawdata / 'GreenCodes/GreenCPC_IPC_codes_fromliterature.xlsx')
    codes_lit['Sub-sector'] = codes_lit['Sub-sector'].str.lower()
    # Keep only those we decided to include
    included = codes_lit['Include'] > 0
    codes_lit = codes_lit[included]

    LOGGER.info('       Get all subgroup codes embedded under the codes of interest.')
    expanded = GetAllSubGroupCodes(codes_lit, concordance)
    # Deal with classification conflicts
    expanded = Correct_ConflictingClassification(expanded, codes_lit)

    # Attach the metadata of each originating code to its subgroups.
    metadata = codes_lit[['Scheme', 'Code', 'Type', 'Sector', 'Sub-sector', 'Include']]
    expanded = metadata.merge(expanded, on='Code', how='right')
    expanded = expanded.drop(columns=['SupCode_DifferentType'])
    for label_column in ('Sub-sector', 'Sector'):
        expanded[label_column] = expanded[label_column].str.strip()

    expanded.to_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/CPC_IPC_codes_allsubgroups.csv', index=False)
    LOGGER.info('       Saved - CPC_IPC_codes_allsubgroups.csv')
    LOGGER.info('SCRIPT END: a_finding_all_CPCIPC_subgroups.py')
